* Macros to setup and estimate automobile model and mode choice model;
* Produced by Mark Kutzbach;

* macro to run state portion of program;
%macro modechoice_setup;

/* run mode choice setup ----------------------------------------------------- */
%if (&run_mode_setup.=1) %then %do;
    
    * delete old file if starting out;
    %if (&metord.=&metord_start.) %then %do;
    proc datasets library=OUTDATA;
        delete modechoice;
    run;
    %end;
    
    * identify list of piks for estimation;
    * where restrictions ensure that worker is in metro area;
    data piklist (keep=pik) 
           pikfile 
           (keep=pik pnc puid pseq pwt
                 age esr qcow
                 qcommute qcarpool w_stfid
                 female
                 grp_hisp grp_white grp_black grp_other grp_c)
            ;
        length w_stfid $11 w_st $2;
        set CENLONG.pik_per_uniqpik
        (keep=pik pnc puid pseq pwt
              qage esr qcow
              qpowst qpowco qpowtract
              qcommute qcarpool
              qsex qspanx qracex
        where=(
            ((esr='1') and ('1' le qcow le '4')) and
            (substr(qpowst,2,2)="&stcode.") and
            (qpowco in (&cty.)) and
            (input(qage,3.) ge 0&agemin.) and
            (input(qage,3.) le 0&agemax.)
            )
        rename=(pwt=pwt_string)
        );
        * output pik list for household construction;
        output piklist;
        * construct variables;
        pwt=input(pwt_string,5.);
        age=input(qage,3.);
        female=0;
        if (qsex=2) then do;
            female=1;
        end;
        grp_hisp=0;
        if (qspanx>1) then do;
            grp_hisp=1;
        end;
        grp_white=0;
        if (qracex=1 and grp_hisp=0) then do;
            grp_white=1;
        end;
        grp_black=0;
        if (qracex=2 and grp_hisp=0) then do;
            grp_black=1;
        end;
        * groups;
        grp_other=0;
        grp_c=4;
        if (grp_white=1) then grp_c=1;
        if (grp_black=1) then grp_c=2;
        if (grp_hisp=1) then grp_c=3;
        if (grp_c=4) then grp_other=1;
        * workplace geography;
        w_st=substr(qpowst,2,2);
        w_stfid=cat(of w_st qpowco qpowtract);
        * output records for analysis sample;
        output pikfile;
    run;
    * pull cpr;
    data cpruse (keep=pik h_stfid huid cpr_geolen address_year);
        set OUTDATA.cpr_&met.
        (keep=pik address_year geocode huid where=(address_year=2000));
        h_stfid=substr(geocode,1,11);
        cpr_geolen=length(h_stfid);
        if (cpr_geolen ge 11);
    run;
    * merge with cpr (length of geocode to use)?;
    data piklist;
        merge piklist (in=in_pik)
              cpruse (in=in_cpr)
              ;
        by pik;
        if (in_pik);
        if (in_cpr);
    run;
    * create list of housing units (include tempcnt so that summary works);
    proc summary data=piklist nway;
        class huid pik;
        var address_year;
        output out=hulist n(address_year)=tempcnt;
    run;
    proc sort data=cpruse;
        by huid pik;
    run;
    * create cartesian product of housing units and people;
    proc sql nowarn;
        create table hupiklist as
            select hulist.huid, hulist.pik, hulist.tempcnt,
                   cpruse.huid, cpruse.pik as pik_mem
            from hulist, cpruse
            where hulist.huid=cpruse.huid
        ;
    quit;
    * summarize the distribution of household sizes (where >1);
    proc summary data=hupiklist;
        class pik;
        var tempcnt;
        output out=hupiklistcount n(tempcnt)=hucount;
    run;
    proc freq data=hupiklistcount;
        title "distribution of household sizes in &met.";
        table hucount;
    run;        
    * list of all other people in housing units;
    data hupiklist;
        retain hucount;
        length hucount 5.;
        set hupiklist;
        by huid;
        if (first.huid) then hucount=0;
        * do not include worker PIK;
        if (pik ne pik_mem);
        * counte each other member;
        hupik=1;
        hucount+1;
    run;

    * summarize EHF for earnings vars;
    libname SNAP "&dirsnap./ehf/&st." access=readonly;
    proc summary data=SNAP.ehf_&st._phf 
        (keep=pik e58 e59 e60 e61 e62) nway;
        class pik;
        var e58 e59 e60 e61 e62;
        output out=ehfphf (drop=_type_ _freq_) sum=;
    run;
    proc sort data=hupiklist;
        by pik_mem;
    run;
    data hupiklist (keep=pik earntotall earner huid hupik);
        merge hupiklist (in=in_hu)
              ehfphf (keep=pik e58 e59 e60 e61 rename=(pik=pik_mem))
              ;
        by pik_mem;
        if (in_hu);
        earntotall=max(0,e58)+max(0,e59)+max(0,e60)+max(0,e61);
        if (earntotall>0) then earner=1;
        else earner=0;
        * only keep up to 10 people in each household (random);
        * this prevents some households from having massive earnings;
        if (hucount<10);
    run;
    * summarize at HUID level across all other members;
    proc summary data=hupiklist nway;
        class pik huid;
        var hupik earntotall earner;
        output out=hupiklist (drop=_TYPE_ _FREQ_) 
        sum(hupik)=pikcnt sum(earntotall)=huearn sum(earner)=huearners;
    run;
    * set up individual data and merge with CPR, hu, household, and tract data;
    *  merge person file with ehf (should also add ecf here);
    data pikfilehu (drop=e58 e59 e60 e61 e62);
        merge piklist (in=in_pik)
              pikfile
              ehfphf (keep=pik e58 e59 e60 e61 e62)
              ;
        by pik;
        * sample begins as those reporting long form jobs who are in CPR counties;
        if (in_pik);
        * total earnings of worker in last year;
        earntotall=max(0,e58)+max(0,e59)+max(0,e60)+max(0,e61);
        * determine if worker should be considered an earner (same definition);
        if (earntotall>0) then earner=1;
        else earner=0;
        * determine if worker should be used in estimation sample;
        * must have continuous earnings for one year and this quarter;
        * this requirement is somewhat analagous to the job separator sample;
        *   at the time of separation;
        if ((e58>0) and (e59>0) and (e60>0) and (e61>0) and (e62>0)) then sample_employed=1;
        else sample_employed=0;
    run;
    proc freq data=pikfilehu;
        title "share of eligible workers that have lehd earnings in &met.";
        table earner;
    run;
    *  merge to huid info, other household earnings;
    proc sort data=pikfilehu;
        by pik huid;
    run;
    data pikfilehu;
        merge pikfilehu (in=in_pik)
              hupiklist
              ;
        by pik huid;
        if (in_pik);
        * prepare variables for estimation (avoid divide by zero);
        huearn=max(0,huearn)+earntotall;
        huearners=max(0,huearners)+earner;
        pikcnt=max(0,pikcnt)+1;
    run;
    *  merge on proximity info;
    proc sort data=pikfilehu (where=(earner=1));
        by w_stfid h_stfid;
    run;
    data pikfilehu;
        merge pikfilehu (in=in_pik)
              OUTDATA.proxmeas_&met. (in=in_prox
              keep=d_stfid o_stfid dist auto tran
              rename=(d_stfid=w_stfid o_stfid=h_stfid))
              ;
        by w_stfid h_stfid;
        if (in_pik);
        * identify those commutes with no tract level match;
        if (in_prox) then sample_prox=1;
        else sample_prox=0;
    run;
    *  merge to household info (number of automobiles);
    proc sort data=pikfilehu;
        by puid pseq;
    run;
    data pikfilehu (drop=state county);
        merge pikfilehu (in=in_pik rename=(puid=mafid pseq=rseq))
              CENLONG.hu (in=in_hu keep=mafid rseq state county sautos
                where=(input(state,2.)=&stcode.))
                ;
        by mafid rseq;
        if (in_pik);
        * check if have household info;
        if (in_hu) then sample_hu=1;
        else sample_hu=0;
    run;
    *  merge to geographic info;
    proc sort data=pikfilehu;
        by h_stfid;
    run;
    data pikfilehu (drop=stmet_prox);
        merge pikfilehu (in=in_pik)
              OUTDATA.census2000_&met. (in=in_tract
               keep=h_stfid pop_sq_mile com_pub_rate poverty_rate owner_rate year_built_med)
              OUTDATA.proxmeassum_all
              (keep=h_stfid tract_transit_mpo tract_transit_census stmet
               rename=(stmet=stmet_prox) where=(stmet_prox="&stmet."))
              ;
        by h_stfid;
        if (in_pik);
        * check if have tract info;
        if (in_tract) then sample_tract=1;
        else sample_tract=0;
        * dependent variables;
        if (sautos>'0') then autohave=1;
        else autohave=0;
        if ((qcommute='01') or (qcommute='07')  or (qcommute='08')) then transit=0;
        else transit=1;
        * ordered logit variable;
        auto_ord=0;
        if (sautos='1') then auto_ord=1;
        if (sautos='2') then auto_ord=2;
        if (sautos>'2') then auto_ord=3;
        * age age_c;
        age_c=0;
        if (age>=20 and age<25) then do;
            age_c=1;
            age_20_24=1;
        end;
        else age_20_24=0;
        if (age>=25 and age<35) then do;
            age_c=2;
            age_25_34=1;
        end;
        else age_25_34=0;
        if (age>=35 and age<45) then do;
            age_c=3;
            age_35_44=1;
        end;
        else age_35_44=0;
        if (age>=45 and age<55) then do;
            age_c=4;
            age_45_54=1;
        end;
        else age_45_54=0;
        if (age>=55 and age<65) then do;
            age_c=5;
            age_55_64=1;
        end;
        else age_55_64=0;
        * normalized variables;
        earn_ln=log(earntotall);
        huearn_ln=log(huearn);
        huearnperwork=huearn/huearners;
        huearnperwork_ln=log(huearnperwork);
        auto_earner=auto_ord/(huearners);
        auto_pik=auto_ord/(pikcnt);
        * auto to transit travel time;
        * if (auto=0) then ratio_tran_auto=1;
        * else ratio_tran_auto=tran/auto;
        if (auto=0) then ratio_tran_auto=0;
        else ratio_tran_auto=(tran-auto)/(0.5*(tran+auto));
        * variable for tabulating sample composition;
        sample_c=0;
        if (sample_hu ne 1) then sample_c=1;
        if (qcommute eq '11') then sample_c=2;
        if (sample_tract ne 1) then sample_c=3;
        if (sample_prox ne 1) then sample_c=4;
        if (tract_transit_census ne 1) then sample_c=5;
        if (tract_transit_mpo ne 1) then sample_c=6;
        if (sample_employed ne 1) then sample_c=7;
        if (earntotall lt 15000 or earntotall gt 40000) then sample_c=8;
    run;

    * append to national file;
    proc append base=OUTDATA.modechoice data=pikfilehu;
    run;
    
%end; * end mode choice setup;

%mend modechoice_setup;


* estimation phase;
%macro modechoice_est;
/* run mode choice estimation ------------------------------------------------------- */
%if (&run_mode_est.=1) %then %do;

    * tabulate restrictions;
    proc freq data=OUTDATA.modechoice;
        title 'Table B1: sample restrictions, unweighted';
        table sample_c;
    run;

    * tabulate restrictions;
    proc freq data=OUTDATA.modechoice;
        title 'sample restrictions, weighted';
        weight pwt;
        table sample_c;
    run;
        
    * create estimation file;
    data pikfilehu;
        set OUTDATA.modechoice 
        (where=(sample_c=0));
        * already restricting on live and work in study area;
        * and employed and in age range (see where long form read in);
    run;                  
                    
    * summary statistics (check out missing values);
    proc freq data=pikfilehu;
        title 'Table B2: journey-to-work';
        table auto_ord*transit sautos autohave*qcommute qcarpool*autohave auto_ord*huearners /missing;
    run;
    * summary statistics (check out missing values);
    proc freq data=pikfilehu;
        title 'journey-to-work, weighted';
        weight pwt;
        table auto_ord*transit /missing;
    run;
    proc means data=pikfilehu;
        var autohave transit
            earntotall huearn earn_ln huearn_ln huearners huearnperwork pikcnt
            pop_sq_mile com_pub_rate poverty_rate owner_rate year_built_med /*build_age_med*/
            age_20_24 age_35_44 age_45_54 age_55_64
            female grp_hisp grp_white grp_black grp_other
            dist tran auto ratio_tran_auto auto_earner auto_pik
            ;
    run;
        
    * Table B3: run auto estimation (out gives prob of level or above);
    proc logistic data=pikfilehu descending
                  outmodel=OUTDATA.model_auto;
        title 'Table B3, Column 1: ORDERED auto choice model';
        model auto_ord=&autovars.;
        ods output ParameterEstimates = OUTDATA.param_auto;
        score out=OUTDATA.predict_auto;
    run;

    * use this portion for forcasting;
    * calculate predicted auto count (drop=auto_ord);
    proc logistic inmodel=OUTDATA.model_auto;
        score data=OUTDATA.predict_auto
        out=OUTDATA.predict_auto_outsample;
    run;
    * expected autos, given that has an auto;
    data OUTDATA.predict_auto2;
        set OUTDATA.predict_auto_outsample;
        pauto=P_1+P_2+P_3;
        predict_wauto_cnt=1*(P_1/pauto)+
                          2*(P_2/pauto)+
                          3*(P_3/pauto);
        label 
            P_1="Pred. Prob. auto_ord=1"
            P_2="Pred. Prob. auto_ord=2"
            P_3="Pred. Prob. auto_ord=3"
            pauto="Pred. Prob. haveauto"
            predict_wauto_cnt="Expected auto count"
        ;
    run;
    * verify auto predictions;
    proc means data=OUTDATA.predict_auto2;
        title 'Verify auto predictions similar to observed';
        class auto_ord;
        var P_1 P_2 P_3 pauto predict_wauto_cnt;
    run;
    * new step - merge in auto predictions to replace observed values;
    proc sort data=pikfilehu;
        by pik;
    run;
    proc sort data=OUTDATA.predict_auto2;
        by pik;
    run;
    data pikfilehu;
        merge pikfilehu (in=in_pik)
              OUTDATA.predict_auto2 (in=in_auto keep=pik pauto predict_wauto_cnt)
              ;
        by pik;
        if (in_pik and in_auto);
        * replace earlier values with new values;
        auto_earner=predict_wauto_cnt/(huearners);
        auto_pik=predict_wauto_cnt/(pikcnt);
        * pauto will weight wauto results, 1-pauto will weight 0auto results;
        p0auto=1-pauto;
    run;

    * run transit estimation, for those without auto;
    proc logistic data=pikfilehu
                  outmodel=OUTDATA.model_tran_0auto;
        title 'ORDERED transit choice model, non-auto';
        model transit(event='1')=&tranvars_0auto.
              / ctable pprob=(.25 .5 .75);
        ods output ParameterEstimates = OUTDATA.param_tran_0auto;
        * weight results to emphasize non-auto households;
        weight p0auto /norm;
        score out=OUTDATA.predict_tran_0auto;
    run;

    * run transit estimation, for those with auto;
    proc logistic data=pikfilehu
                  outmodel=OUTDATA.model_tran_wauto;
        title 'ORDERED transit choice model, with auto';
        model transit(event='1')=&tranvars_wauto.
              / ctable pprob=(.25 .5 .75);
        ods output ParameterEstimates = OUTDATA.param_tran_wauto;
        * weight results to emphasize auto households;
        weight pauto /norm;
        score out=OUTDATA.predict_tran_wauto;
    run;

    * consider a logistic model that is based one-stage setup (no auto choice);
    proc logistic data=pikfilehu
                  outmodel=OUTDATA.model_tran_all;
        title 'ORDERED transit choice model, all, one-stage';
        model transit(event='1')=&tranvars_wauto.
              / ctable pprob=(.25 .5 .75);
        ods output ParameterEstimates = OUTDATA.param_tran_all;
        score out=OUTDATA.predict_tran_all;
    run;
        
    * use this portion for forcasting;
    * calculate predicted transit given no autos (drop=tran);
    proc logistic inmodel=OUTDATA.model_tran_0auto;
        score data=OUTDATA.predict_tran_0auto
        out=OUTDATA.predict_tran_0auto_outsample;
    run;
    * calculate predicted transit given has autos;
    proc logistic inmodel=OUTDATA.model_tran_wauto;
        score data=OUTDATA.predict_tran_wauto
        out=OUTDATA.predict_tran_wauto_outsample;
    run;
    * calculate predicted transit for one-stage;
    proc logistic inmodel=OUTDATA.model_tran_all;
        score data=OUTDATA.predict_tran_all
        out=OUTDATA.predict_tran_all_outsample;
    run;
    * verify transit predictions;
    proc means data=OUTDATA.predict_tran_0auto_outsample;
        title 'Verify transit predictions for no auto similar to observed';
        class transit;
        var P_1;
    run;
    * verify transit predictions;
    proc means data=OUTDATA.predict_tran_wauto_outsample;
        title 'Verify transit predictions for with auto similar to observed';
        class transit;
        var P_1;
    run;
    * verify transit predictions;
    proc means data=OUTDATA.predict_tran_all_outsample;
        title 'Verify transit predictions for one-stage similar to observed';
        class transit;
        var P_1;
    run;

    * find means;
    proc means data=pikfilehu (where=(auto_ord=0));
        title 'means with no auto';
        var huearnperwork_ln
            pop_sq_mile com_pub_rate poverty_rate owner_rate year_built_med
            age_20_24 age_35_44 age_45_54 age_55_64
            female grp_black grp_hisp grp_other
            earn_ln ratio_tran_auto
            ;
    run;
    proc means data=pikfilehu (where=(auto_ord>0));
        title 'means with at least one auto';
        var auto_ord
            huearnperwork_ln
            auto_earner auto_pik
            pop_sq_mile com_pub_rate poverty_rate owner_rate year_built_med
            age_20_24 age_35_44 age_45_54 age_55_64
            female grp_black grp_hisp grp_other
            earn_ln ratio_tran_auto
            ;
    run;
    proc means data=pikfilehu;
        title 'means for all, for one-stage model';
        var auto_ord
            huearnperwork_ln
            auto_earner auto_pik
            pop_sq_mile com_pub_rate poverty_rate owner_rate year_built_med
            age_20_24 age_35_44 age_45_54 age_55_64
            female grp_black grp_hisp grp_other
            earn_ln ratio_tran_auto
            ;
    run;
    * demean interacted variables;
    proc summary data=pikfilehu nway;
        var ratio_tran_auto earn_ln;
        weight p0auto;
        output out=pikfilehu_sum_noauto (drop=_type_ _freq_)
            mean(ratio_tran_auto)=ratio_mean_noauto
            mean(earn_ln)=earn_ln_noauto;
    run;
    proc summary data=pikfilehu nway;
        var ratio_tran_auto earn_ln;
        weight pauto;
        output out=pikfilehu_sum_wiauto (drop=_type_ _freq_)
            mean(ratio_tran_auto)=ratio_mean_wiauto
            mean(earn_ln)=earn_ln_wiauto;
    run;
    proc summary data=pikfilehu nway;
        var ratio_tran_auto earn_ln;
        weight pauto;
        output out=pikfilehu_sum_all (drop=_type_ _freq_)
            mean(ratio_tran_auto)=ratio_mean_all
            mean(earn_ln)=earn_ln_all;
    run;
    * unfortunately, this only merges means onto first row;
    data pikfilehu;
        merge pikfilehu (in=in_pikfile
                  rename=(ratio_tran_auto=ratio_tran_auto_raw
                          earn_ln=earn_ln_raw))
              pikfilehu_sum_noauto (keep=ratio_mean_noauto earn_ln_noauto)
              pikfilehu_sum_wiauto (keep=ratio_mean_wiauto earn_ln_wiauto)
              pikfilehu_sum_all (keep=ratio_mean_all earn_ln_all)
              ;
        if (in_pikfile);
    run;
    * without the retain here, means only appear on first row (after merge);
    data pikfilehu;
        length ratio_mean_noauto ratio_mean_wiauto ratio_mean_all 
           earn_ln_noauto earn_ln_wiauto earn_ln_all 
           t_ratio_mean_noauto t_ratio_mean_wiauto t_ratio_mean_all 
           t_earn_ln_noauto t_earn_ln_wiauto t_earn_ln_all 8.;
        retain t_ratio_mean_noauto t_ratio_mean_wiauto t_ratio_mean_all 
               t_earn_ln_noauto t_earn_ln_wiauto t_earn_ln_all;
        set pikfilehu;
        if (t_ratio_mean_noauto=.) then do;
            t_ratio_mean_noauto=ratio_mean_noauto;
            t_ratio_mean_wiauto=ratio_mean_wiauto;
            t_ratio_mean_all=ratio_mean_all;
            t_earn_ln_noauto=earn_ln_noauto;
            t_earn_ln_wiauto=earn_ln_wiauto;
            t_earn_ln_all=earn_ln_all;
        end;
        ratio_mean_noauto=t_ratio_mean_noauto;
        ratio_mean_wiauto=t_ratio_mean_wiauto;
        ratio_mean_all=t_ratio_mean_all;
        earn_ln_noauto=t_earn_ln_noauto;
        earn_ln_wiauto=t_earn_ln_wiauto;
        earn_ln_all=t_earn_ln_all;
        * demean;
        de_ratio_tran_noauto=ratio_tran_auto_raw-ratio_mean_noauto;
        de_ratio_tran_wiauto=ratio_tran_auto_raw-ratio_mean_wiauto;
        de_ratio_tran_all=ratio_tran_auto_raw-ratio_mean_all;
        de_earn_ln_noauto=earn_ln_raw-earn_ln_noauto;
        de_earn_ln_wiauto=earn_ln_raw-earn_ln_wiauto;
        de_earn_ln_all=earn_ln_raw-earn_ln_all;
    run;
    proc means data=pikfilehu;
        title 'demeaned means with probability of no auto weights';
        weight p0auto;
        var de_earn_ln_noauto de_ratio_tran_noauto
            ;
    run;
    proc means data=pikfilehu;
        title 'demeaned means with probability of at least one auto weights';
        weight pauto;
        var de_earn_ln_wiauto de_ratio_tran_wiauto
            ;
    run;    
    proc means data=pikfilehu;
        title 'demeaned means for all, one-stage, no weights';
        var de_earn_ln_all de_ratio_tran_all
            ;
    run;    
    data OUTDATA.pikfilehu_1000;
        set pikfilehu (obs=1000);
    run;
    * Table B3: estimate demeaned results;
    proc logistic data=pikfilehu (rename=(de_earn_ln_noauto=earn_ln de_ratio_tran_noauto=ratio_tran_auto))
                  outmodel=OUTDATA.model_tran_0auto_demean;
        title 'Table B3, Column 2: ORDERED transit choice model, non-auto, demeaned';
        model transit(event='1')=&tranvars_0auto.
              / ctable pprob=(.25 .5 .75);
        ods output ParameterEstimates = OUTDATA.param_tran_0auto_demean;
        weight p0auto /norm;
        score out=OUTDATA.predict_tran_0auto_demean;
    run;
    proc logistic data=pikfilehu (rename=(de_earn_ln_wiauto=earn_ln de_ratio_tran_wiauto=ratio_tran_auto))
                  outmodel=OUTDATA.model_tran_wauto_demean;
        title 'Table B3, Column 3: ORDERED transit choice model, with auto, demeaned';
        model transit(event='1')=&tranvars_wauto.
              / ctable pprob=(.25 .5 .75);
        ods output ParameterEstimates = OUTDATA.param_tran_wauto_demean;
        weight pauto /norm;
        score out=OUTDATA.predict_tran_wauto_demean;
    run;
    proc logistic data=pikfilehu (rename=(de_earn_ln_all=earn_ln de_ratio_tran_all=ratio_tran_auto))
                  outmodel=OUTDATA.model_tran_all_demean;
        title 'ORDERED transit choice model, all one-stage, demeaned';
        model transit(event='1')=&tranvars_wauto.
              / ctable pprob=(.25 .5 .75);
        ods output ParameterEstimates = OUTDATA.param_tran_all_demean;
        weight pauto /norm;
        score out=OUTDATA.predict_tran_all_demean;
    run;

%end; * end mode choice estimation;

%mend modechoice_est;

